import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import numbers
import os
import plotly
import matplotlib.pyplot as plt
import scvelo as scv
from tqdm.notebook import tqdm
import itertools
import random
import seaborn as sns
import rpy2
import rpy2.robjects as ro
warnings.filterwarnings('ignore')
%load_ext rpy2.ipython
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
plotly.offline.init_notebook_mode()
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (9, 9)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
hostRoot = "-".join(socket.gethostname().split('-')[0:2])
with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
paths = yaml.load(f, Loader=yaml.FullLoader)
#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir/"
FinaLeaf="/Progenitors"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
adata = sc.read_h5ad(outdir+FinaLeaf+"/4B_Progenitors_DA.h5ad")
adataraw = sc.read_h5ad(outdir+"/1_polaroid_mint.h5ad")[adata.obs_names]
adataraw.obs = adata.obs
adata = adataraw
sc.pp.normalize_total(adata)
normalizing counts per cell
finished (0:00:00)
adata.obs
| dataset | organoid | region | type | type_region | regionContrast | n_genes_by_counts | log1p_n_genes_by_counts | total_counts | log1p_total_counts | ... | S_score | G2M_score | phase | Progenitor | subLeiden | umap_density_type | umap_density_region | umap_density_organoid | DiffAbundant | leiden_partition_final | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACGAATCCCTTGTG-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 5025 | 8.522380 | 20194.0 | 9.913190 | ... | -0.028440 | -0.286710 | G1 | NonCyclingProgenitor | 8 | 0.412203 | 0.284801 | 0.196506 | 0 | not_enriched |
| AAGTACCCAAAGCTCT-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 1987 | 7.594884 | 3498.0 | 8.160233 | ... | -0.220022 | -0.168710 | G1 | NonCyclingProgenitor | 3 | 0.342266 | 0.389470 | 0.119090 | 0 | not_enriched |
| AAGTACCCACTGGACC-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 2253 | 7.720462 | 3527.0 | 8.168487 | ... | -0.125708 | -0.218010 | G1 | NonCyclingProgenitor | 3 | 0.689598 | 0.848694 | 0.214638 | 0 | not_enriched |
| AAGTACCTCCAATCCC-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 3634 | 8.198364 | 13578.0 | 9.516280 | ... | -0.103682 | -0.110184 | G1 | NonCyclingProgenitor | 1 | 0.731295 | 0.616268 | 0.568335 | 0 | not_enriched |
| AATAGAGAGATTAGTG-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 5001 | 8.517593 | 15610.0 | 9.655731 | ... | -0.149420 | -0.256755 | G1 | NonCyclingProgenitor | 3 | 0.764932 | 0.987658 | 0.220167 | 0 | not_enriched |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTGTTGTAGGTCTGGA-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 4526 | 8.417815 | 13515.0 | 9.511629 | ... | -0.051561 | -0.075598 | G1 | NonCyclingProgenitor | 4 | 0.938423 | 0.915668 | 0.550702 | 1 | enriched |
| TTTCACAGTCTGTGTA-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 3570 | 8.180601 | 9301.0 | 9.137984 | ... | -0.189825 | -0.170289 | G1 | NonCyclingProgenitor | 2 | 0.328438 | 0.249450 | 0.296778 | 1 | enriched |
| TTTCAGTCAATTCACG-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 1766 | 7.477038 | 2722.0 | 7.909490 | ... | -0.181366 | -0.118886 | G1 | NonCyclingProgenitor | 0 | 0.642922 | 0.689690 | 0.197166 | 1 | enriched |
| TTTCCTCAGTCCTACA-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 2687 | 7.896553 | 7148.0 | 8.874728 | ... | -0.091811 | -0.112348 | G1 | NonCyclingProgenitor | 6 | 0.676237 | 0.641296 | 0.702468 | 1 | enriched |
| TTTGATCTCCGTTTCG-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 3964 | 8.285261 | 13345.0 | 9.498972 | ... | -0.122932 | -0.105680 | G1 | NonCyclingProgenitor | 1 | 0.387854 | 0.212226 | 0.231424 | 0 | not_enriched |
2073 rows × 30 columns
adata.write_h5ad(outdir+FinaLeaf+"/5B_Progenitors_preBulk.h5ad")
groupingCovariate = "region"
PseudooReplicates_per_group = 10
print("Pbulking with target of "+str(PseudooReplicates_per_group)+" PRs will result in following cells per PR")
adata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
Pbulking with target of 10 PRs will result in following cells per PR
region distal 15.7 medial 10.3 piece1 21.9 piece2 24.4 piece3 23.5 proximal 111.5 dtype: float64
total = pd.DataFrame(index = adata.var_names)
total_metadata = pd.DataFrame(index= ["_".join(Rep) for Rep in list(itertools.product(adata.obs[groupingCovariate].cat.categories.tolist(), [str(r) for r in list(range(PseudooReplicates_per_group))]))])
for group in adata.obs[groupingCovariate].cat.categories:
groupAdata = adata[adata.obs[groupingCovariate] == group]
group_cells = groupAdata.obs_names.tolist()
random.Random(4).shuffle(group_cells)
metaCellslist=[group_cells[i::PseudooReplicates_per_group] for i in range(PseudooReplicates_per_group)]
for partition in enumerate(metaCellslist):
total['{}_{}'.format(group, partition[0])] = adata[partition[1]].X.sum(axis = 0).A1
total_metadata.loc['{}_{}'.format(group, partition[0]),"group"] = group
total_metadata.loc['{}_{}'.format(group, partition[0]),"pseudoreplicate"] = partition[0]
total_metadata.loc['{}_{}'.format(group, partition[0]),"number_of_cell"] = int(len(partition[1]))
#With this line we propagate - whenever possible - obs to aggregated pReplicate
for obsMD in [obsMD for obsMD in groupAdata.obs.columns.tolist() if len(groupAdata.obs[obsMD].unique()) == 1 and obsMD != groupingCovariate]:
total_metadata.loc[["_".join(l) for l in list(itertools.product([group],[str(r) for r in list(range(PseudooReplicates_per_group))]))], obsMD ] = adata.obs.loc[adata.obs[groupingCovariate] == group,obsMD][0]
total_metadata = total_metadata.dropna(axis = 1)
adata_bulk = sc.AnnData(total.transpose()).copy()
adata_bulk.var = adata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis = 1)
adata_bulk.obs["group"] =adata_bulk.obs["group"].astype("category")
with open("./colorMap.yaml", 'r') as f:
colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
colorMap
adata_bulk.uns["group_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["group"].cat.categories]
adata_bulk.obs["regionContrast"] = adata_bulk.obs["regionContrast"].astype("category")
adata_bulk.uns["regionContrast_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["regionContrast"].cat.categories]
adata_bulk.write_h5ad(outdir+FinaLeaf+"/5B_Progenitors_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.h5ad")
total.to_csv(outdir+FinaLeaf+"/5B_Progenitors_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.tsv", sep="\t")
... storing 'type' as categorical ... storing 'type_region' as categorical ... storing 'is.Stressed' as categorical ... storing 'phase' as categorical ... storing 'Progenitor' as categorical